/*==============================================================================
FILE NAME: Process_Facility_Characteristics.do
INPUT: facility_characteristics.dta
OUTPUT: facility_characteristics_clean.dta
CREATED: 3 September 2025
UPDATED: 23 July 2025
==============================================================================*/

// Suppress -more- prompts so output scrolls without pausing
set more off

/* Standalone use: fill in your username and root path below so the
   processed-data global is defined when this file is run on its own */
if "`c(username)'" == "" { // insert username
    global rootdir "" // insert root path
    // Global path used by the replication package
    global processed_data "$rootdir/processed_data"
}

// Read the facility characteristics file, replacing any data in memory
use "$processed_data/facility_characteristics.dta", clear

// Report the number of distinct regulated entities
// (-distinct- is a community-contributed command; it must be installed)
distinct RegulatedEntityNo

// Remove columns that are not needed downstream
drop Q N State PhysicalAddress PhysicalAddress2

// Column O holds an alternate zip code; give it a descriptive name
rename O ZipCode_alt

// Turn both zip code columns into numeric variables
destring ZipCode ZipCode_alt, replace

// ZipFlag = 1 marks rows whose primary zip is missing but whose alternate
// zip is available (ZipFlag is left missing on every other row)
gen ZipFlag = 1 if ZipCode == . & ZipCode_alt != .

// Backfill the primary zip from the alternate on exactly the flagged rows
replace ZipCode = ZipCode_alt if ZipFlag == 1

// The alternate zip and the zip extension are no longer needed
drop ZipCode_alt ZipCodeExt

// Restrict zip codes to valid Texas values.
// Texas ZIP prefixes are 733 (Austin), 750-799, and 885 (El Paso). A single
// interval from 73301 to 88589 would also keep prefixes assigned to other
// states (e.g. 734-749 and 800-884), so each Texas block is tested
// separately. The outer bounds 73301 and 88589 are retained, so nothing
// outside the previous interval is newly accepted.
replace ZipCode = . if !missing(ZipCode) & !( ///
    inrange(ZipCode, 73301, 73399) | ///
    inrange(ZipCode, 75000, 79999) | ///
    inrange(ZipCode, 88500, 88589) )

// Where City is blank but NearestCity is not, use NearestCity as the city
replace City = NearestCity if missing(City) & !missing(NearestCity)

// NearestCity has served its purpose; remove it
drop NearestCity

// Build a numeric 2-digit SIC code from the first two characters of SIC
gen SIC_2digit = substr(SIC,1,2)
destring SIC_2digit, replace
drop SIC

// For each regulated entity, assign its most common 2-digit SIC code.
// Without a tie-break option, egen's mode() returns missing whenever an
// entity has several equally frequent codes; minmode resolves such ties
// deterministically by taking the lowest of the tied codes.
bys RegulatedEntityNo: egen SIC = mode(SIC_2digit), minmode

// Drop temporary SIC_2digit column
drop SIC_2digit

// Collapse rows that are identical on every variable
duplicates drop

// Shorten the identifier names: regulated entity -> RN, customer -> CN,
// program ID -> PN
rename (RegulatedEntityNo CustomerNo ProgramIDNo) (RN CN PN)

// Write the cleaned facility characteristics file, overwriting any prior copy
save "$processed_data/facility_characteristics_clean.dta", replace
